{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd \n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import warnings\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_PATH = 'Data/'\n",
    "OUTPUT_PATH = 'Output/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "preds_df = pd.read_csv(DATA_PATH + 'Sample Submission.csv')\n",
    "oofs_df = pd.read_csv(DATA_PATH + 'train_labels.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oofs_df['lgb'] = pd.read_csv(OUTPUT_PATH + 'oof_lgbm_2.csv')['outcome_flag']\n",
    "oofs_df['lgb_180'] = pd.read_csv(OUTPUT_PATH + 'oof_lgbm_3.csv')['outcome_flag']\n",
    "oofs_df['xgb'] = pd.read_csv(OUTPUT_PATH + 'oofs_xgb.csv')['outcome_flag']\n",
    "oofs_df['xgb_180'] = pd.read_csv(OUTPUT_PATH + 'oofs_xgb_2.csv')['outcome_flag']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "preds_df['lgb'] = pd.read_excel(OUTPUT_PATH + 'preds_lgbm_2.xlsx')['outcome_flag']\n",
    "preds_df['lgb_180'] = pd.read_excel(OUTPUT_PATH + 'preds_lgbm_3.xlsx')['outcome_flag']\n",
    "preds_df['xgb'] = pd.read_excel(OUTPUT_PATH + 'preds_xgb.xlsx')['outcome_flag']\n",
    "preds_df['xgb_180'] = pd.read_excel(OUTPUT_PATH + 'preds_xgb_2.xlsx')['outcome_flag']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['patient_id', 'outcome_flag', 'lgb', 'lgb_180', 'xgb', 'xgb_180'], dtype='object')"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oofs_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ID_COL, TARGET_COL = 'patient_id', 'outcome_flag'\n",
    "features = [c for c in oofs_df.columns if c not in [ID_COL, TARGET_COL]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train, test = oofs_df, preds_df\n",
    "target = train[TARGET_COL]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "AUC: 0.8959155010318733\n",
      "fold n°1\n",
      "AUC: 0.8968155239623941\n",
      "fold n°2\n",
      "AUC: 0.868803026828709\n",
      "fold n°3\n",
      "AUC: 0.8692702361843614\n",
      "fold n°4\n",
      "AUC: 0.8659653749140106\n",
      "fold n°5\n",
      "AUC: 0.8591292134831461\n",
      "fold n°6\n",
      "AUC: 0.8713669023477277\n",
      "fold n°7\n",
      "AUC: 0.8716547815167679\n",
      "fold n°8\n",
      "AUC: 0.8898252364549611\n",
      "fold n°9\n",
      "AUC: 0.8618048915360068\n",
      "CV score: 0.87408 \n"
     ]
    }
   ],
   "source": [
    "max_iter = 10\n",
    "folds = StratifiedKFold(n_splits=10, random_state=1991)\n",
    "oof = np.zeros(len(train))\n",
    "predictions = np.zeros(len(test))\n",
    "\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    X_trn, y_trn = train.iloc[trn_idx][features], target.iloc[trn_idx]\n",
    "    X_val, y_val = train.iloc[val_idx][features], target.iloc[val_idx]\n",
    "    X_test = test[features]\n",
    "    clf = LogisticRegression(penalty='l1', C=20, class_weight={0:1, 1:0.01}, random_state=41)\n",
    "    clf.fit(X_trn, y_trn)\n",
    "    oof[val_idx] = clf.predict_proba(X_val)[:,1]\n",
    "    print(f\"AUC: {roc_auc_score(y_val, oof[val_idx])}\")\n",
    "    current_pred = clf.predict_proba(X_test)[:,1]\n",
    "    predictions += current_pred/min(max_iter, folds.n_splits)\n",
    "    \n",
    "print(\"CV score: {:<8.5f}\".format(roc_auc_score(target, oof)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Blend AUC: 0.8748597551026407\n"
     ]
    }
   ],
   "source": [
    "blend_oof = (oofs_df['lgb'] * 0.8 + oofs_df['lgb_180'] * 0.2) * 0.2 + (oofs_df['xgb'] * 0.7 + oofs_df['xgb_180']*0.3) * 0.8\n",
    "print(f\"Blend AUC: {roc_auc_score(target, blend_oof)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Net AUC: 0.8748590661501208\n"
     ]
    }
   ],
   "source": [
    "net_oof = blend_oof * 0.86 + oof * 0.14\n",
    "print(f\"Net AUC: {roc_auc_score(target, net_oof)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "blend_preds = (preds_df['lgb'] * 0.8 + preds_df['lgb_180'] * 0.2) * 0.2 + (preds_df['xgb'] * 0.7 + preds_df['xgb_180']*0.3) * 0.8\n",
    "net_preds = blend_preds * 0.86 + predictions * 0.14"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oof_df = pd.DataFrame()\n",
    "oof_df['patient_id'] = train['patient_id']\n",
    "oof_df[TARGET_COL] = net_oof\n",
    "oof_df.to_csv(OUTPUT_PATH + 'oof_ensemble_6.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>patient_id</th>\n",
       "      <th>outcome_flag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>patient_2</td>\n",
       "      <td>0.065141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>patient_3</td>\n",
       "      <td>0.148098</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>patient_5</td>\n",
       "      <td>0.517646</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>patient_8</td>\n",
       "      <td>0.005452</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>patient_14</td>\n",
       "      <td>0.389862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>patient_15</td>\n",
       "      <td>0.073515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>patient_16</td>\n",
       "      <td>0.008973</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>patient_33</td>\n",
       "      <td>0.138347</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>patient_38</td>\n",
       "      <td>0.016393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>patient_41</td>\n",
       "      <td>0.125466</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   patient_id  outcome_flag\n",
       "0   patient_2      0.065141\n",
       "1   patient_3      0.148098\n",
       "2   patient_5      0.517646\n",
       "3   patient_8      0.005452\n",
       "4  patient_14      0.389862\n",
       "5  patient_15      0.073515\n",
       "6  patient_16      0.008973\n",
       "7  patient_33      0.138347\n",
       "8  patient_38      0.016393\n",
       "9  patient_41      0.125466"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub_df = pd.DataFrame()\n",
    "sub_df[ID_COL] = test[ID_COL]\n",
    "sub_df[TARGET_COL] = net_preds\n",
    "sub_df[[TARGET_COL]].to_excel(OUTPUT_PATH + 'preds_ensemble_6.xlsx', index=False)\n",
    "sub_df.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
